# Report-wide chunk defaults: keep messages and warnings out of the output.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
library(patchwork)
library(GGally)
library(dplyr)
library(tidyverse)
library(cowplot)
library(plotly)
# Load the governance-indicator table.
# Paths are built with file.path() so the script also runs on macOS/Linux;
# backslash-separated literals such as ".\\data\\..." only resolve on Windows.
corruption <- read_csv(file.path("data", "corruption_EDA.csv"))

# Indicator columns that the summaries and plots need as numbers.
indicator_cols <- c(
  "corruption_index",
  "government_effectiveness",
  "political_stability_and_absence_of_violence_terrorism",
  "regulatory_quality",
  "rule_of_law",
  "voice_and_accountability"
)

# Coerce all indicators in one pass; entries that cannot be converted to a
# number become NA.
corruption <- corruption |>
  mutate(across(all_of(indicator_cols), as.numeric))

# Second table with one column per year ("2005" ... "2022"); it is reshaped
# to long form where it is plotted.
cpi_year <- read_csv(file.path("data", "cpi_data_year.csv"))
# Average corruption index per year (values rounded to 4 significant digits
# before averaging), drawn as red points joined by a red line.
ggplot(
  summarise(
    group_by(corruption, year),
    mean = mean(signif(as.numeric(corruption_index), 4), na.rm = TRUE)
  ),
  aes(x = year, y = mean)
) +
  geom_point(color = "red") +
  geom_line(aes(group = 1), color = "red")

# Reshape the wide table (one column per year, "2005" through "2022") into
# long form, average the values within each year, and draw the yearly means
# as red points joined by a red line.
cpi_year %>%
  pivot_longer(
    cols = "2005":"2022",
    names_to = "year",
    values_to = "cpi"
  ) %>%
  group_by(year) %>%
  summarise(mean = mean(cpi, na.rm = TRUE)) %>%
  ggplot(aes(x = year, y = mean)) +
  geom_point(color = "red") +
  # group = 1 joins the points into a single line across the year labels.
  geom_line(aes(group = 1), color = "red")

# One scatter plot per year (2019-2022) of the corruption index against
# government effectiveness, each with a fitted linear trend (red) and its
# standard-error band.
myplots <- vector("list", 4)
for (i in 2019:2022) {
  # Years 2019..2022 map to list slots 1..4.
  myplots[[i - 2018]] <- corruption %>%
    filter(year == i) %>%
    ggplot(
      # Only x and y are mapped; aes() takes no extra unnamed positional
      # value, so the stray `9` that followed y has been removed.
      aes(
        x = as.numeric(government_effectiveness),
        y = as.numeric(corruption_index)
      )
    ) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = sprintf("year %s", i))
}

# Arrange the four yearly plots as two rows of two.
plot_row1 <- plot_grid(myplots[[1]], myplots[[2]])
plot_row2 <- plot_grid(myplots[[3]], myplots[[4]])

# Bold, left-aligned heading drawn on its own canvas.
title <- ggdraw() +
  draw_label(
    "Scatter Plot for corruption and effectiveness",
    fontface = "bold",
    x = 0,
    hjust = 0
  ) +
  # Left margin so the heading lines up with the left edge of the first plot.
  theme(plot.margin = margin(0, 0, 0, 7))

# Stack heading and both rows; the heading gets a tenth of a row's height.
plot_grid(
  title, plot_row1, plot_row2,
  ncol = 1,
  label_size = 12,
  rel_heights = c(0.1, 1, 1)
)

# Yearly scatter plots (2019-2022) of the corruption index against political
# stability, both rounded to 4 significant digits, each with a red linear
# fit and its standard-error band.
myplots <- vector("list", 4)
for (i in 2019:2022) {
  # Years 2019..2022 map to list slots 1..4.
  myplots[[i - 2018]] <- ggplot(
    data = filter(corruption, year == i),
    mapping = aes(
      x = signif(
        as.numeric(political_stability_and_absence_of_violence_terrorism),
        4
      ),
      y = signif(as.numeric(corruption_index), 4)
    )
  ) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = sprintf("year %s", i))
}

# Two rows of two plots.
plot_row1 <- plot_grid(myplots[[1]], myplots[[2]])
plot_row2 <- plot_grid(myplots[[3]], myplots[[4]])

# Bold, left-aligned heading on its own canvas; the left margin lines it up
# with the left edge of the first plot.
title <- ggdraw() +
  draw_label(
    "Scatter Plot for corruption and stability",
    fontface = "bold",
    x = 0,
    hjust = 0
  ) +
  theme(plot.margin = margin(0, 0, 0, 7))

# Heading on top (a tenth of a row's height), then the two plot rows.
plot_grid(
  title, plot_row1, plot_row2,
  ncol = 1,
  label_size = 12,
  rel_heights = c(0.1, 1, 1)
)

# Yearly scatter plots (2019-2022) of the corruption index against
# regulatory quality, both rounded to 4 significant digits, each with a red
# linear fit and its standard-error band.
myplots <- vector("list", 4)
for (i in 2019:2022) {
  # Years 2019..2022 map to list slots 1..4.
  myplots[[i - 2018]] <- ggplot(
    data = filter(corruption, year == i),
    mapping = aes(
      x = signif(as.numeric(regulatory_quality), 4),
      y = signif(as.numeric(corruption_index), 4)
    )
  ) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = sprintf("year %s", i))
}

# Two rows of two plots.
plot_row1 <- plot_grid(myplots[[1]], myplots[[2]])
plot_row2 <- plot_grid(myplots[[3]], myplots[[4]])

# Bold, left-aligned heading on its own canvas; the left margin lines it up
# with the left edge of the first plot.
title <- ggdraw() +
  draw_label(
    "Scatter Plot for corruption and regulatory quality",
    fontface = "bold",
    x = 0,
    hjust = 0
  ) +
  theme(plot.margin = margin(0, 0, 0, 7))

# Heading on top (a tenth of a row's height), then the two plot rows.
plot_grid(
  title, plot_row1, plot_row2,
  ncol = 1,
  label_size = 12,
  rel_heights = c(0.1, 1, 1)
)

# Yearly scatter plots (2019-2022) of the corruption index against rule of
# law, both rounded to 4 significant digits, each with a red linear fit and
# its standard-error band.
myplots <- vector("list", 4)
for (i in 2019:2022) {
  # Years 2019..2022 map to list slots 1..4.
  myplots[[i - 2018]] <- corruption %>%
    filter(year == i) %>%
    ggplot(
      aes(
        x = signif(as.numeric(rule_of_law), 4),
        y = signif(as.numeric(corruption_index), 4)
      )
    ) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = sprintf("year %s", i))
}

# Two rows of two plots.
plot_row1 <- plot_grid(myplots[[1]], myplots[[2]])
plot_row2 <- plot_grid(myplots[[3]], myplots[[4]])

# Bold, left-aligned heading on its own canvas. The text names the variable
# actually plotted on the x axis (rule_of_law); it previously repeated the
# "regulatory quality" heading of a different figure.
title <- ggdraw() +
  draw_label(
    "Scatter Plot for corruption and rule of law",
    fontface = "bold",
    x = 0,
    hjust = 0
  ) +
  # Left margin so the heading lines up with the left edge of the first plot.
  theme(plot.margin = margin(0, 0, 0, 7))

# Heading on top (a tenth of a row's height), then the two plot rows.
plot_grid(
  title, plot_row1, plot_row2,
  ncol = 1,
  label_size = 12,
  rel_heights = c(0.1, 1, 1)
)

# Yearly scatter plots (2019-2022) of the corruption index against voice and
# accountability, both rounded to 4 significant digits, each with a red
# linear fit and its standard-error band.
myplots <- vector("list", 4)
for (i in 2019:2022) {
  # Years 2019..2022 map to list slots 1..4.
  myplots[[i - 2018]] <- ggplot(
    data = filter(corruption, year == i),
    mapping = aes(
      x = signif(as.numeric(voice_and_accountability), 4),
      y = signif(as.numeric(corruption_index), 4)
    )
  ) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = sprintf("year %s", i))
}

# Two rows of two plots.
plot_row1 <- plot_grid(myplots[[1]], myplots[[2]])
plot_row2 <- plot_grid(myplots[[3]], myplots[[4]])

# Bold, left-aligned heading on its own canvas; the left margin lines it up
# with the left edge of the first plot.
title <- ggdraw() +
  draw_label(
    "Scatter Plot for corruption against voice and accountability",
    fontface = "bold",
    x = 0,
    hjust = 0
  ) +
  theme(plot.margin = margin(0, 0, 0, 7))

# Heading on top (a tenth of a row's height), then the two plot rows.
plot_grid(
  title, plot_row1, plot_row2,
  ncol = 1,
  label_size = 12,
  rel_heights = c(0.1, 1, 1)
)

# Average corruption index (rounded to 4 significant digits before
# averaging) for every year within each development group, drawn as one
# coloured line with filled point markers per group.
corruption %>%
  group_by(year, development) %>%
  summarize(
    mean_cpi = mean(signif(as.numeric(corruption_index), 4), na.rm = TRUE)
  ) %>%
  ggplot(
    aes(x = year, y = mean_cpi, group = development, color = development)
  ) +
  geom_point(shape = 19, size = 3) +
  geom_line()

# Number of distinct countries in each development category.
development_prop <- corruption |>
  group_by(development) |>
  summarize(count = n_distinct(country_name))

library(RColorBrewer)
myPalette <- brewer.pal(3, "Set2")

# Slice labels are read from the summarised data, so every label is paired
# with its own count and cannot be misspelled or fall out of step with the
# row order of `development_prop`.
pie(
  pull(development_prop, count),
  labels = pull(development_prop, development),
  border = "white",
  col = myPalette
)

# Panel 1: mean corruption index per continent over every row of the data,
# one filled bar per continent.
mean_cpi <- corruption %>%
  group_by(continent) %>%
  summarize(mean_cpi = mean(as.numeric(corruption_index), na.rm = TRUE)) %>%
  ggplot(aes(x = continent, y = mean_cpi, fill = continent)) +
  geom_col()

# Panel 2: distribution of the 2022 corruption index, one translucent
# density curve per continent.
cpi_density <- corruption %>%
  filter(year == 2022) %>%
  group_by(continent) %>%
  ggplot(aes(x = corruption_index)) +
  geom_density(aes(fill = continent), alpha = 0.15) +
  labs(x = "cpi", y = "Density")

# Combine both panels into a single figure (patchwork `+`) and display it.
cpi_continent <- mean_cpi + cpi_density
cpi_continent

# Panel 1: mean GDP per continent over every row of the data, one filled bar
# per continent.
mean_gdp <- corruption |>
  group_by(continent) |>
  summarize(gdp = mean(as.numeric(gdp), na.rm = TRUE)) |>
  ggplot(aes(x = continent, y = gdp, fill = continent)) +
  geom_col()

# Panel 2: histogram of 2022 GDP values, filled by continent.
# geom_histogram() puts the number of observations per bin on the y axis, so
# the axis is labelled "Count" (it was mislabelled "Density").
# The variable name is kept because the rest of the script refers to it.
gdp_density <- corruption |>
  filter(year == 2022) |>
  ggplot(aes(x = gdp)) +
  geom_histogram(aes(fill = continent), alpha = 0.15) +
  labs(x = "gdp", y = "Count")
gdp_density

# Combine both panels into a single figure (patchwork `+`) and display it.
gdp_continent <- mean_gdp + gdp_density
gdp_continent

library(ggplot2)
library(dplyr)
library(tidyr)
library(forcats)
library(hrbrthemes)
library(viridis)
# Box plots of 2022 population per continent; boxes are filled and outlined
# with the discrete viridis palette and the legend is hidden.
corruption %>%
  filter(year == 2022) %>%
  select(continent, population) %>%
  ggplot(
    aes(x = continent, y = population, fill = continent, color = continent)
  ) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE) +
  scale_color_viridis(discrete = TRUE) +
  theme_ipsum() +
  theme(legend.position = "none")

# Box plots of population per continent with the individual observations
# overlaid as small red jittered points. Uses every row of `corruption`
# (no year filter is applied here).
ggplot(corruption, aes(x = continent, y = population, fill = continent)) +
  geom_boxplot() +
  scale_fill_viridis(discrete = TRUE, alpha = 0.6) +
  geom_jitter(color = "red", size = 0.4, alpha = 0.9) +
  theme_ipsum() +
  theme(
    legend.position = "none",
    plot.title = element_text(size = 11)
  ) +
  # Title plus an empty x-axis label.
  labs(title = "population of countries", x = "")

# One scatter plot per year (2019-2022) of the corruption index against
# gdp / population, each with a fitted linear trend (red) and its
# standard-error band.
myplots <- vector("list", 4)
for (i in 2019:2022) {
  # Years 2019..2022 map to list slots 1..4.
  myplots[[i - 2018]] <- corruption %>%
    filter(year == i) %>%
    ggplot(
      # Only x and y are mapped; aes() takes no extra unnamed positional
      # value, so the stray `9` that followed y has been removed.
      aes(
        x = gdp / population,
        y = as.numeric(corruption_index)
      )
    ) +
    geom_point() +
    geom_smooth(method = "lm", se = TRUE, color = "red") +
    labs(title = sprintf("year %s", i))
}

# Two rows of two plots.
plot_row1 <- plot_grid(myplots[[1]], myplots[[2]])
plot_row2 <- plot_grid(myplots[[3]], myplots[[4]])

# Bold, left-aligned heading on its own canvas. The text describes the
# x variable actually plotted (gdp / population); it previously repeated the
# "effectiveness" heading of a different figure.
title <- ggdraw() +
  draw_label(
    "Scatter Plot for corruption and GDP per capita",
    fontface = "bold",
    x = 0,
    hjust = 0
  ) +
  # Left margin so the heading lines up with the left edge of the first plot.
  theme(plot.margin = margin(0, 0, 0, 7))

# Heading on top (a tenth of a row's height), then the two plot rows.
plot_grid(
  title, plot_row1, plot_row2,
  ncol = 1,
  label_size = 12,
  rel_heights = c(0.1, 1, 1)
)

# Rescale a numeric vector to the [0, 1] range (min-max normalisation).
#
# Arguments:
#   x      Numeric vector.
#   na.rm  If TRUE (the default), missing values are ignored when finding
#          the minimum and maximum and stay NA in the result. If FALSE, any
#          NA in `x` makes the whole result NA.
#
# Returns a numeric vector the same length as `x`. When all non-missing
# values are equal, the range is zero and the result is NaN for those
# entries.
normalize <- function(x, na.rm = TRUE) {
  # `na.rm` is forwarded here; without it a single NA would turn both
  # limits, and therefore every output value, into NA.
  limits <- range(x, na.rm = na.rm)
  (x - limits[1]) / (limits[2] - limits[1])
}
# Display the 2022 rows, keeping only the corruption index, country name,
# population and GDP columns.
select(
  filter(corruption, year == 2022),
  corruption_index, country_name, population, gdp
)
## # A tibble: 214 × 4
##    corruption_index country_name        population    gdp
##               <dbl> <chr>                    <dbl>  <dbl>
##  1          -1.18   Afghanistan           41128771  NA   
##  2          -0.408  Albania                2775634  18.9 
##  3          -0.638  Algeria               44903225 192   
##  4           1.27   American Samoa           44273  NA   
##  5           1.27   Andorra                  79824   3.35
##  6          -0.602  Angola                35588987 107   
##  7           1.27   Anguilla                    NA  NA   
##  8           0.311  Antigua and Barbuda      93763   1.76
##  9          -0.447  Argentina             46234830 633   
## 10           0.0280 Armenia                2780469  19.5 
## # ℹ 204 more rows
# Extract three columns of the full table as plain vectors.
pop <- corruption[["population"]]
gdp <- corruption[["gdp"]]
cpi <- corruption[["corruption_index"]]

# Population: min-max scaling done through scale(), which returns a
# one-column matrix carrying the centre and scale as attributes.
pop <- scale(
  pop,
  center = min(pop, na.rm = TRUE),
  scale = max(pop, na.rm = TRUE) - min(pop, na.rm = TRUE)
)
# GDP: rescaled with the normalize() helper defined in this script.
gdp <- normalize(gdp)
# Corruption index: scale() with its defaults (centre on the mean, divide by
# the standard deviation).
cpi <- scale(as.numeric(cpi))

# Print the full table.
corruption
## # A tibble: 5,136 × 15
##    country_name country_code  year corruption_index government_effectiveness
##    <chr>        <chr>        <dbl>            <dbl>                    <dbl>
##  1 Afghanistan  AFG           1996            -1.29                   -2.18 
##  2 Afghanistan  AFG           1998            -1.18                   -2.10 
##  3 Afghanistan  AFG           2000            -1.27                   -2.17 
##  4 Afghanistan  AFG           2002            -1.25                   -1.59 
##  5 Afghanistan  AFG           2003            -1.34                   -1.18 
##  6 Afghanistan  AFG           2004            -1.35                   -0.945
##  7 Afghanistan  AFG           2005            -1.45                   -1.23 
##  8 Afghanistan  AFG           2006            -1.45                   -1.47 
##  9 Afghanistan  AFG           2007            -1.61                   -1.44 
## 10 Afghanistan  AFG           2008            -1.67                   -1.53 
## # ℹ 5,126 more rows
## # ℹ 10 more variables:
## #   political_stability_and_absence_of_violence_terrorism <dbl>,
## #   regulatory_quality <dbl>, rule_of_law <dbl>,
## #   voice_and_accountability <dbl>, gdp <dbl>, continent <chr>,
## #   development <chr>, latitude <dbl>, longitude <dbl>, population <dbl>
# Mean corruption index for every combination of population bucket and
# development level, shown as a 3 x 3 surface.
dev_pop <- corruption |>
  mutate(
    # Population buckets: 3 = above 100 million, 2 = above 30 million,
    # 1 = everything else. A missing population stays missing instead of
    # falling through to the smallest bucket.
    pop = case_when(
      is.na(population) ~ NA_real_,
      population > 100000000 ~ 3,
      population > 30000000 ~ 2,
      .default = 1
    ),
    # Development level as an ordinal code; any other label becomes NA.
    development = case_when(
      development == "Least Developed" ~ 1,
      development == "Developing" ~ 2,
      development == "Developed" ~ 3
    )
  ) |>
  # Rows that cannot be placed in the 3 x 3 grid are left out.
  filter(!is.na(pop), !is.na(development)) |>
  group_by(pop, development) |>
  summarize(
    cpi = mean(as.numeric(corruption_index), na.rm = TRUE),
    .groups = "drop"
  )

pop <- pull(dev_pop, pop)
dev <- pull(dev_pop, development)
cpi <- pull(dev_pop, cpi)

# Rows = population bucket, columns = development level. Each mean is
# written to the cell named by its own (pop, development) pair, so the
# result does not depend on all nine combinations being present or on the
# order of the summary rows. Combinations with no data stay NA.
ma_dev_pop <- matrix(NA_real_, nrow = 3, ncol = 3)
ma_dev_pop[cbind(pop, dev)] <- cpi

fig <- plot_ly(z = ~ma_dev_pop)
fig <- fig %>% add_surface()
fig
# Put the 2022 countries on an integer grid built from log-scaled GDP and
# population, storing a log-scaled corruption score in each occupied cell.
# NOTE(review): log() of a non-positive corruption index is NaN, and
# drop_na() then removes those rows, so only countries with a positive
# index reach the grid. Confirm this is intended.
log_pop <- corruption |>
  filter(year == 2022) |>
  mutate(
    gdp = round(10 * log(gdp), 0),
    population = 4 * round(log(population), 0),
    corruption_index = 41 + round(10 * log(as.numeric(corruption_index)), 0)
  ) |>
  # Applied before select(), so a missing value in any column drops the row.
  drop_na() |>
  select(country_name, gdp, population, corruption_index)

# Grid of zeros; rows are indexed by scaled population, columns by scaled GDP.
grid_size <- 110
ma_log <- matrix(0, nrow = grid_size, ncol = grid_size)

pop <- pull(log_pop, population)
gdp <- pull(log_pop, gdp)
cpi <- pull(log_pop, corruption_index)

for (i in seq_along(pop)) {
  # Write only to cells that exist. A zero index would silently do nothing,
  # a negative index would overwrite every *other* cell of the row or
  # column, and an index beyond the grid would stop with an error.
  in_grid <- is.finite(pop[i]) && is.finite(gdp[i]) &&
    pop[i] >= 1 && pop[i] <= grid_size &&
    gdp[i] >= 1 && gdp[i] <= grid_size
  if (in_grid) {
    ma_log[pop[i], gdp[i]] <- cpi[i]
  }
  # Echo each scaled score, as in the recorded output that follows.
  print(cpi[i])
}
## [1] 43
## [1] 29
## [1] 5
## [1] 47
## [1] 43
## [1] 43
## [1] 21
## [1] 44
## [1] 45
## [1] 37
## [1] 43
## [1] 41
## [1] 46
## [1] 41
## [1] 0
## [1] 33
## [1] 22
## [1] 32
## [1] 37
## [1] 50
## [1] 35
## [1] 45
## [1] 32
## [1] 49
## [1] 43
## [1] 36
## [1] 47
## [1] 8
## [1] 34
## [1] 46
## [1] 45
## [1] 46
## [1] 39
## [1] 35
## [1] 45
## [1] 15
## [1] 27
## [1] 38
## [1] 20
## [1] 37
## [1] 38
## [1] 47
## [1] 40
## [1] 27
## [1] 27
## [1] 31
## [1] 32
## [1] 34
## [1] 26
## [1] 36
## [1] 48
## [1] 49
## [1] 48
## [1] 12
## [1] 34
## [1] 38
## [1] 19
## [1] 39
## [1] 0
## [1] 35
## [1] 35
## [1] 28
## [1] 31
## [1] 46
## [1] 48
## [1] 26
## [1] 38
## [1] 37
## [1] 32
## [1] 36
## [1] 38
## [1] 48
## [1] 48
## [1] 37
## [1] 42
## [1] 46
## [1] 42
## [1] 46
# Draw the `ma_log` grid as an interactive 3-D surface and display it.
fig <- add_surface(plot_ly(z = ~ ma_log))
fig
# Pairwise plot matrix of the six governance indicators plus continent.
corruption %>%
  select(
    corruption_index,
    political_stability_and_absence_of_violence_terrorism,
    government_effectiveness,
    regulatory_quality,
    rule_of_law,
    voice_and_accountability,
    continent
  ) %>%
  ggpairs(
    # Passed by name; it was previously matched to `mapping` only by
    # position.
    mapping = ggplot2::aes(alpha = 0.1),
    # ggpairs() has no `subtitle` argument (extra arguments are ignored),
    # so the former subtitle text is carried in the title instead.
    title = "Correlations Between Key factors - By Continents"
  ) +
  scale_fill_discrete() +
  # Rotate x-axis tick labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))